Initialization and import of data
# Install VGAM only when it is missing, so that re-running the script does
# not download and reinstall the package on every execution.
if (!requireNamespace("VGAM", quietly = TRUE)) {
  install.packages("VGAM")
}
essai de l'URL 'https://cran.rstudio.com/bin/macosx/contrib/4.0/VGAM_1.1-5.tgz'
Content type 'application/x-gzip' length 7281930 bytes (6.9 MB)
==================================================
downloaded 6.9 MB
The downloaded binary packages are in
/var/folders/6g/jj4zrn8n0pv9nvlgqdvg6k500000gn/T//RtmpT1znVe/downloaded_packages
# List the column names of dataSet (dataSet is loaded outside this excerpt).
names(dataSet)
[1] "battery_power" "blue" "clock_speed" "dual_sim" "fc" "four_g" "int_memory" "m_dep" "mobile_wt" "n_cores" "pc" "px_height" "px_width" "ram"
[15] "sc_h" "sc_w" "talk_time" "three_g" "touch_screen" "wifi" "price_range"
Display of the set of data.
Columns: battery_power, blue, clock_speed, dual_sim, fc, four_g, int_memory, m_dep, mobile_wt, n_cores, pc, px_height, px_width, ram, sc_h, sc_w, talk_time, three_g, touch_screen, wifi, price_range
battery_power: Total energy a battery can store at one time, measured in mAh; blue: Has Bluetooth or not; clock_speed: Speed at which the microprocessor executes instructions; dual_sim: Has dual-SIM support or not; fc: Front camera megapixels; four_g: Has 4G or not; int_memory: Internal memory in gigabytes; m_dep: Mobile depth in cm; mobile_wt: Weight of the mobile phone; n_cores: Number of processor cores; pc: Primary camera megapixels; px_height: Pixel resolution height; px_width: Pixel resolution width; ram: Random-access memory in megabytes; sc_h: Screen height of the mobile in cm; sc_w: Screen width of the mobile in cm; talk_time: Longest time that a single battery charge will last when you are talking; three_g: Has 3G or not; touch_screen: Has touch screen or not; wifi: Has Wi-Fi or not; price_range: The target variable, with values 0 (low cost), 1 (medium cost), 2 (high cost) and 3 (very high cost).
# Number of rows and columns.
dim(dataSet)
[1] 2000 21
# Container type of the data.
class(dataSet)
[1] "data.frame"
# Preview of the first rows.
head(dataSet)
# Storage class of every column.
sapply(dataSet, class)
battery_power blue clock_speed dual_sim fc four_g int_memory m_dep mobile_wt n_cores pc px_height px_width ram sc_h sc_w talk_time
"integer" "integer" "numeric" "integer" "integer" "integer" "integer" "numeric" "integer" "integer" "integer" "integer" "integer" "integer" "integer" "integer" "integer"
three_g touch_screen wifi price_range
"integer" "integer" "integer" "integer"
# Per-column summary statistics (min, quartiles, median, mean, max).
summary(dataSet)
battery_power blue clock_speed dual_sim fc four_g int_memory m_dep mobile_wt n_cores pc px_height px_width ram
Min. : 501.0 Min. :0.000 Min. :0.500 Min. :0.0000 Min. : 0.000 Min. :0.0000 Min. : 2.00 Min. :0.1000 Min. : 80.0 Min. :1.000 Min. : 0.000 Min. : 0.0 Min. : 500.0 Min. : 256
1st Qu.: 851.8 1st Qu.:0.000 1st Qu.:0.700 1st Qu.:0.0000 1st Qu.: 1.000 1st Qu.:0.0000 1st Qu.:16.00 1st Qu.:0.2000 1st Qu.:109.0 1st Qu.:3.000 1st Qu.: 5.000 1st Qu.: 282.8 1st Qu.: 874.8 1st Qu.:1208
Median :1226.0 Median :0.000 Median :1.500 Median :1.0000 Median : 3.000 Median :1.0000 Median :32.00 Median :0.5000 Median :141.0 Median :4.000 Median :10.000 Median : 564.0 Median :1247.0 Median :2146
Mean :1238.5 Mean :0.495 Mean :1.522 Mean :0.5095 Mean : 4.309 Mean :0.5215 Mean :32.05 Mean :0.5018 Mean :140.2 Mean :4.521 Mean : 9.916 Mean : 645.1 Mean :1251.5 Mean :2124
3rd Qu.:1615.2 3rd Qu.:1.000 3rd Qu.:2.200 3rd Qu.:1.0000 3rd Qu.: 7.000 3rd Qu.:1.0000 3rd Qu.:48.00 3rd Qu.:0.8000 3rd Qu.:170.0 3rd Qu.:7.000 3rd Qu.:15.000 3rd Qu.: 947.2 3rd Qu.:1633.0 3rd Qu.:3064
Max. :1998.0 Max. :1.000 Max. :3.000 Max. :1.0000 Max. :19.000 Max. :1.0000 Max. :64.00 Max. :1.0000 Max. :200.0 Max. :8.000 Max. :20.000 Max. :1960.0 Max. :1998.0 Max. :3998
sc_h sc_w talk_time three_g touch_screen wifi price_range
Min. : 5.00 Min. : 0.000 Min. : 2.00 Min. :0.0000 Min. :0.000 Min. :0.000 Min. :0.00
1st Qu.: 9.00 1st Qu.: 2.000 1st Qu.: 6.00 1st Qu.:1.0000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.75
Median :12.00 Median : 5.000 Median :11.00 Median :1.0000 Median :1.000 Median :1.000 Median :1.50
Mean :12.31 Mean : 5.767 Mean :11.01 Mean :0.7615 Mean :0.503 Mean :0.507 Mean :1.50
3rd Qu.:16.00 3rd Qu.: 9.000 3rd Qu.:16.00 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:2.25
Max. :19.00 Max. :18.000 Max. :20.00 Max. :1.0000 Max. :1.000 Max. :1.000 Max. :3.00
library(ggplot2)

# Number of phones in each price class (0 = low cost ... 3 = very high cost).
# Fixing the levels keeps all four classes even if one of them is empty.
price_counts <- table(factor(dataSet$price_range, levels = 0:3))

df <- data.frame(
  # A factor gives one distinct fill colour per class; a numeric group would
  # be drawn with a continuous colour gradient.
  group = factor(names(price_counts), levels = names(price_counts)),
  value = as.vector(price_counts)
)

# Single stacked bar holding the four class counts.
bp <- ggplot(df, aes(x = "", y = value, fill = group)) +
  geom_bar(width = 1, stat = "identity")
bp

# The same bar drawn in polar coordinates, i.e. a pie chart.
pie <- bp + coord_polar("y", start = 0)
pie

# NOTE(review): fig() is defined outside this excerpt; presumably it sets the
# figure size -- confirm.
fig(18, 16)
Correlation plot showing which features are most strongly correlated with each other
library(ggcorrplot)

# Pairwise correlation matrix of all columns, rounded to 8 decimal places.
corr <- round(cor(dataSet), digits = 8)

# Draw the matrix as a correlation heat map.
ggcorrplot(corr = corr)
fig(18, 16)

# Compact overview of every column: type and first values.
str(dataSet)
'data.frame': 2000 obs. of 21 variables:
$ battery_power: int 842 1021 563 615 1821 1859 1821 1954 1445 509 ...
$ blue : int 0 1 1 1 1 0 0 0 1 1 ...
$ clock_speed : num 2.2 0.5 0.5 2.5 1.2 0.5 1.7 0.5 0.5 0.6 ...
$ dual_sim : int 0 1 1 0 0 1 0 1 0 1 ...
$ fc : int 1 0 2 0 13 3 4 0 0 2 ...
$ four_g : int 0 1 1 0 1 0 1 0 0 1 ...
$ int_memory : int 7 53 41 10 44 22 10 24 53 9 ...
$ m_dep : num 0.6 0.7 0.9 0.8 0.6 0.7 0.8 0.8 0.7 0.1 ...
$ mobile_wt : int 188 136 145 131 141 164 139 187 174 93 ...
$ n_cores : int 2 3 5 6 2 1 8 4 7 5 ...
$ pc : int 2 6 6 9 14 7 10 0 14 15 ...
$ px_height : int 20 905 1263 1216 1208 1004 381 512 386 1137 ...
$ px_width : int 756 1988 1716 1786 1212 1654 1018 1149 836 1224 ...
$ ram : int 2549 2631 2603 2769 1411 1067 3220 700 1099 513 ...
$ sc_h : int 9 17 11 16 8 17 13 16 17 19 ...
$ sc_w : int 7 3 2 8 2 1 8 3 1 10 ...
$ talk_time : int 19 7 9 11 15 10 18 5 20 12 ...
$ three_g : int 0 1 1 1 1 1 1 1 1 1 ...
$ touch_screen : int 0 1 1 0 1 0 0 1 0 0 ...
$ wifi : int 1 0 0 0 0 0 1 1 0 0 ...
$ price_range : int 1 2 2 2 1 1 3 0 0 0 ...
Displaying the cell proportions of the binary (0/1) features of the dataSet
# Share of rows taking each value (0 or 1) for every binary feature.
# prop.table() returns proportions that sum to 1, not percentages.
prop.table(table(dataSet$blue)) # cell proportions: bluetooth
0 1
0.505 0.495
prop.table(table(dataSet$dual_sim)) # cell proportions: dual SIM
0 1
0.4905 0.5095
prop.table(table(dataSet$four_g)) # cell proportions: 4G
0 1
0.4785 0.5215
prop.table(table(dataSet$three_g)) # cell proportions: 3G
0 1
0.2385 0.7615
prop.table(table(dataSet$touch_screen)) # cell proportions: touch screen
0 1
0.497 0.503
prop.table(table(dataSet$wifi)) # cell proportions: wifi
0 1
0.493 0.507
Overlaid bar charts comparing the distributions of the screen height and the screen width (both in cm)
library(ggplot2)

# Long-format table: one row per measurement, labelled with the screen
# dimension it belongs to (all heights first, then all widths).
data <- data.frame(
  Dimensions_in_cm = c(dataSet$sc_h, dataSet$sc_w),
  Screen = rep(c("Height", "Width"), each = nrow(dataSet))
)

# Both distributions drawn on top of each other; the transparency keeps the
# overlapping bars readable.
ggplot(data, aes(x = Dimensions_in_cm, fill = Screen)) +
  geom_bar(position = "identity", alpha = 0.6)
Comparing the impact of two features on the mobile’s price range
library(ggplot2)
library(gridExtra)

# Treat the price class as categorical so each class gets its own colour/box.
dataSet$price_range <- as.factor(dataSet$price_range)

# Both axes are continuous, so a scatter plot is used: a box plot needs a
# discrete x axis and would collapse every width into one box per price class.
p1 <- ggplot(dataSet, aes(x = px_width, y = px_height, color = price_range)) +
  geom_point(alpha = 0.6) +
  labs(title = "Pixel Resolution Height vs Pixel Resolution Width")

# Distribution of RAM within each price class; outliers are drawn as large
# red asterisks.
p2 <- ggplot(dataSet, aes(x = price_range, y = ram, color = price_range)) +
  geom_boxplot(
    outlier.colour = "red",
    outlier.shape = 8,
    outlier.size = 4
  ) +
  labs(title = "RAM vs Price Range")

# Show both plots side by side.
grid.arrange(p1, p2, nrow = 1)

# NOTE(review): fig() is defined outside this excerpt; presumably it sets the
# figure size -- confirm.
fig(24, 20)
Comparing the impact of two features on the mobile’s price range
library(ggplot2)
library(gridExtra)

# Price class must be categorical for the per-class box plots below.
dataSet$price_range <- as.factor(dataSet$price_range)

# Box-plot layer shared by both panels: outliers as large red asterisks.
price_boxplot_layer <- geom_boxplot(
  outlier.colour = "red",
  outlier.shape = 8,
  outlier.size = 4
)

# Internal memory per price class.
p3 <- ggplot(
  dataSet,
  aes(x = price_range, y = int_memory, color = price_range)
) +
  price_boxplot_layer +
  labs(title = "int_memory vs Price Range")

# Battery power per price class.
p4 <- ggplot(
  dataSet,
  aes(x = price_range, y = battery_power, color = price_range)
) +
  price_boxplot_layer +
  labs(title = "Battery power vs Price Range")

# Both panels on a single row.
grid.arrange(p3, p4, nrow = 1)
fig(24, 20)
Box plots of three features (RAM, internal memory, battery power), each restricted to its interquartile range
library(dplyr)
library(ggplot2)
library(gridExtra)

# Each subset below keeps only the rows lying strictly between the first and
# third quartile of one feature (rows equal to a quartile are dropped).
# Columns are referenced by bare name inside filter() (data masking) rather
# than through dataSet$..., and filter() is namespace-qualified because other
# attached packages also provide a `filter`.

# Battery_power IQR
firstQuantile <- quantile(dataSet$battery_power, probs = 0.25)
thirdQuantile <- quantile(dataSet$battery_power, probs = 0.75)
battery_powerIQR <- dataSet %>%
  dplyr::filter(battery_power > firstQuantile, battery_power < thirdQuantile)

# Int Memory IQR
firstQuantile <- quantile(dataSet$int_memory, probs = 0.25)
thirdQuantile <- quantile(dataSet$int_memory, probs = 0.75)
int_memoryIQR <- dataSet %>%
  dplyr::filter(int_memory > firstQuantile, int_memory < thirdQuantile)

# ram IQR
firstQuantile <- quantile(dataSet$ram, probs = 0.25)
thirdQuantile <- quantile(dataSet$ram, probs = 0.75)
ramIQR <- dataSet %>%
  dplyr::filter(ram > firstQuantile, ram < thirdQuantile)

# One box plot per restricted feature, split by price class; outliers are
# drawn as large red asterisks.
p2 <- ggplot(ramIQR, aes(x = price_range, y = ram, color = price_range)) +
  geom_boxplot(
    outlier.colour = "red",
    outlier.shape = 8,
    outlier.size = 4
  ) +
  labs(title = "IQR RAM vs Price Range")

p3 <- ggplot(
  int_memoryIQR,
  aes(x = price_range, y = int_memory, color = price_range)
) +
  geom_boxplot(
    outlier.colour = "red",
    outlier.shape = 8,
    outlier.size = 4
  ) +
  labs(title = "IQR int memory vs Price Range")

p4 <- ggplot(
  battery_powerIQR,
  aes(x = price_range, y = battery_power, color = price_range)
) +
  geom_boxplot(
    outlier.colour = "red",
    outlier.shape = 8,
    outlier.size = 4
  ) +
  labs(title = "IQR battery power vs Price Range")

# All three panels on a single row.
grid.arrange(p2, p3, p4, nrow = 1)

# NOTE(review): fig() is defined outside this excerpt; presumably it sets the
# figure size -- confirm.
fig(24, 20)
Data classification depending on battery power, ram, memory, clock speed and number of cores
# Install pacman only when it is missing, so that re-running the script does
# not download and reinstall the package on every execution.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
essai de l'URL 'https://cran.rstudio.com/bin/macosx/contrib/4.0/pacman_0.5.1.tgz'
Content type 'application/x-gzip' length 379950 bytes (371 KB)
==================================================
downloaded 371 KB
The downloaded binary packages are in
/var/folders/6g/jj4zrn8n0pv9nvlgqdvg6k500000gn/T//RtmpT1znVe/downloaded_packages
library(ggplot2)

# Install dplyr only when it is missing; reinstalling a package that is
# already installed (and possibly attached) on every run is unnecessary.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
essai de l'URL 'https://cran.rstudio.com/bin/macosx/contrib/4.0/dplyr_1.0.5.tgz'
Content type 'application/x-gzip' length 1251016 bytes (1.2 MB)
==================================================
downloaded 1.2 MB
The downloaded binary packages are in
/var/folders/6g/jj4zrn8n0pv9nvlgqdvg6k500000gn/T//RtmpT1znVe/downloaded_packages
# RAM against battery power, one colour per price class, with an ellipse
# drawn around each class.
p <- ggplot(
  dataSet,
  aes(x = battery_power, y = ram, colour = price_range)
) +
  geom_point()
p + stat_ellipse()

# RAM against internal memory, same colouring and per-class ellipses.
p <- ggplot(
  dataSet,
  aes(x = int_memory, y = ram, colour = price_range)
) +
  geom_point()
p + stat_ellipse()
3D graphic
library(tidyverse)
library(plotly)
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Attachement du package : ‘plotly’
The following object is masked from ‘package:rio’:
export
The following object is masked from ‘package:ggplot2’:
last_plot
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
# Interactive 3D scatter plot: battery power (x), RAM (y) and internal
# memory (z), coloured by price class.
# Columns are mapped with formulas (~column) so plotly looks them up in the
# data argument instead of receiving detached dataSet$... vectors.
p <- plot_ly(
  dataSet,
  x = ~battery_power,
  y = ~ram,
  z = ~int_memory,
  color = ~price_range
) %>%
  add_markers(size = 1) %>%
  layout(
    scene = list(
      xaxis = list(title = 'Battery Power'),
      yaxis = list(title = 'Ram'),
      zaxis = list(title = 'Memoire interne')
    )
  )
p
# Report the score together with the fold count (k and score are computed
# outside this excerpt).
print(paste0(k, " nbFold, Score F : ", score))
[1] "10 nbFold, Score F : 0.937552039966694"